Experiment was done in three batches: Batch1 samples: NM05N, NM04N1, NH64T Batch2 samples: NH87ND, NH87T, HCM-CSHL-0655-C50 Batch3 samples: multiplexed 4 samples with the following barcodes GACAGTGC HCM-CSHL-0366-C50 GAGTTAGC NH85TSc GATGAATC NH95T GCCAAGAC NH93T
10x Single cell RNA-seq analysis -- pre-processing using cell ranger -- aligned to grch38
load packages
library(Seurat)
library(dplyr)
library(Matrix)
library(magrittr)
library(future.apply)
library(cowplot)
library(hdf5r)
library(stringr)
library(ggplot2)
getwd()
NH64T= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_03-NH64T_filtered_feature_bc_matrix.h5",use.names=TRUE, unique.features = TRUE)
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## read SB04 remapped to grch38
NH87T= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_04_10xGEX_NH87TT_filtered_feature_bc_matrix.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
`HCM-CSHL-0655-C50`= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_04_10xGEX_HC_0655_filtered_feature_bc_matrix.h5")
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## read SB06 mapped to grch38
sb06_data= Read10X_h5("/Users/sonambhatia/Documents_local/tnbc_manuscript_r_scripts/data/Bhatia_06_superloaded10xGEX_filtered_feature_bc_matrix.h5",use.names=TRUE, unique.features = TRUE)
## Warning in sparseMatrix(i = indices[] + 1, p = indptr[], x = as.numeric(x =
## counts[]), : 'giveCsparse' has been deprecated; setting 'repr = "T"' for you
## make Seurat objects
sb06_obj= CreateSeuratObject(sb06_data, min.cells = 3, min.features = 200, project = "sb06_4x")
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
# barcodes from Vireo
library(readr)
barcode_key <- read_csv("~/Documents/HumanOrganoid/Bioinformatics/scRNA-seq/Data_from_JPreall/SB06/demultiplex_vireo/barcode_key.csv",
col_names = FALSE)
donor_ids <- read_delim("~/Documents/HumanOrganoid/Bioinformatics/scRNA-seq/Data_from_JPreall/SB06/demultiplex_vireo/vireo/donor_ids.tsv",
"\t", escape_double = FALSE, trim_ws = TRUE)
#GACAGTGC HCM-CSHL-0366-C50
#GAGTTAGC NH85TSc
#GATGAATC NH95T
#GCCAAGAC NH93T
donor_ids$organoid_id= donor_ids$donor_id
`HCM-CSHL-0366-C50_cells` = grep("GACAGTGC",donor_ids$organoid_id)
donor_ids$organoid_id[`HCM-CSHL-0366-C50_cells`]="HCM-CSHL-0366-C50"
NH85TSc_cells = grep("GAGTTAGC",donor_ids$organoid_id)
donor_ids$organoid_id[NH85TSc_cells]="NH85TSc"
NH95TT_cells = grep("GATGAATC",donor_ids$organoid_id)
donor_ids$organoid_id[NH95TT_cells]="NH95T"
NH93T_cells = grep("GCCAAGAC",donor_ids$organoid_id)
donor_ids$organoid_id[NH93T_cells]="NH93T"
summary(as.factor(donor_ids$organoid_id))
## add barcode labels to metadata
tail(donor_ids)
tail(rownames(sb06_obj@meta.data))
myBarcode = rownames(sb06_obj@meta.data) #get barcode from seurat
test = donor_ids[match(myBarcode, donor_ids$cell), ] #match the order of seurat barcode with you data
sb06_obj$vireo_id = test$organoid_id #put cell type into metadata
sb06_obj$vireo_id
## make Seurat objects
nh64t_obj= CreateSeuratObject(NH64T, min.cells = 3, min.features = 200, project = "NH64T")
nh87tt_obj= CreateSeuratObject(NH87T, min.cells = 3, min.features = 200, project = "NH87T")
hc_0655_obj= CreateSeuratObject(`HCM-CSHL-0655-C50`, min.cells = 3, min.features = 200, project = "HCM-CSHL-0655-C50")
# Check the metadata in the new Seurat objects
head(hc_0655_obj@meta.data)
head(sb06_obj@meta.data)
sb06_obj@meta.data$test= sb06_obj@meta.data$vireo_id
# set the orig.ident of sb06 as the patient id
sb06_obj@meta.data$orig.ident= sb06_obj@meta.data$vireo_id
head(sb06_obj@meta.data)
##
my.list= Filter(function(x) is(x, "Seurat"), mget(ls()))
names(my.list)
# add tumor vs normal in the metadata as a stimulus for the samples
my.list$nh64t_obj$stim ="tumor"
my.list$hc_0655_obj$stim ="tumor"
my.list$nh87tt_obj$stim ="tumor"
my.list$sb06_obj$stim ="tumor"
*merge seurat objects sources: https://github.com/hbctraining/scRNA-seq/blob/master/lessons/03_SC_quality_control-setup.md https://satijalab.org/seurat/v3.0/merge_vignette.html
merged_seurat <- merge(nh64t_obj, y = c(hc_0655_obj,nh87tt_obj,sb06_obj),
add.cell.ids = c("nh64t_obj","hc_0655_obj","nh87tt_obj","sb06_obj"), project = "merged_all",
merge.data = TRUE)
# Check that the merged object has the appropriate sample-specific prefixes
head(merged_seurat@meta.data)
tail(merged_seurat@meta.data)
# Explore merged metadata
# View(merged_seurat@meta.data)
# Add number of genes per UMI for each cell to metadata
merged_seurat$log10GenesPerUMI <- log10(merged_seurat$nFeature_RNA) / log10(merged_seurat$nCount_RNA)
# Compute percent mito ratio
merged_seurat$mitoRatio <- PercentageFeatureSet(object = merged_seurat, pattern = "^MT-")
merged_seurat$mitoRatio <- merged_seurat@meta.data$mitoRatio / 100
View(merged_seurat@meta.data)
# Create metadata dataframe
metadata <- merged_seurat@meta.data
# Add cell IDs to metadata
metadata$cells <- rownames(metadata)
# Rename columns
metadata <- metadata %>%
dplyr::rename(seq_folder = orig.ident,
nUMI = nCount_RNA,
nGene = nFeature_RNA)
# Create sample column
metadata$sample <- NA
metadata$sample[which(str_detect(metadata$seq_folder, "HCM-CSHL-0655-C50"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH87T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH64T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH93T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH95T"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "NH85TSc"))] <- "tumor"
metadata$sample[which(str_detect(metadata$seq_folder, "HCM-CSHL-0366-C50"))] <- "tumor"
# Add metadata back to Seurat object
merged_seurat@meta.data <- metadata
# Create .RData object to load at any time
save(merged_seurat, file="merged_seurat_tumor.RData")
#pdf("tumor_total_cell_numbers.#pdf")
metadata %>%
ggplot(aes(x=sample, fill=sample)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NCells")
#dev.off()
#pdf("tumor_total_cell_numbers_sample_type.#pdf")
metadata %>%
ggplot(aes(x=seq_folder, fill=seq_folder)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NCells")
#dev.off()
# Visualize the number UMIs/transcripts per cell
#pdf("tumor_total_cell_numbers_UMI_per_cell.#pdf")
metadata %>%
ggplot(aes(color=sample, x=nUMI, fill= sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)
#dev.off()
#pdf("tumor_total_cell_numbers_UMI_per_cell_type.#pdf")
metadata %>%
ggplot(aes(color=seq_folder, x=nUMI, fill= seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
ylab("Cell density") +
geom_vline(xintercept = 500)
#dev.off()
# Visualize the distribution of genes detected per cell via histogram
#pdf("tumor_total_cell_numbers_genes_per_cell.#pdf")
metadata %>%
ggplot(aes(color=sample, x=nGene, fill= sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = 300)
#dev.off()
# Visualize the distribution of genes detected per cell via boxplot
#pdf("tumor_total_cell_numbers_genes_per_cell_type.#pdf")
metadata %>%
ggplot(aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NCells vs NGenes")
#dev.off()
# Visualize the correlation between genes detected and number of UMIs
# Are there cells present with low numbers of genes/UMIs?
#pdf("tumor_correlation_genes_vs_num_of_UMIs.#pdf")
metadata %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~sample)
## `geom_smooth()` using formula 'y ~ x'
#dev.off()
#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype.#pdf")
metadata %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'
#dev.off()
# Visualize the distribution of mitochondrial gene expression detected per cell
#pdf("tumor_correlation_MTgenes_vs_cell.#pdf")
metadata %>%
ggplot(aes(color=sample, x=mitoRatio, fill=sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 194 rows containing non-finite values (stat_density).
#dev.off()
#pdf("tumor_correlation_MTgenes_vs_cell_celltype.#pdf")
metadata %>%
ggplot(aes(color=seq_folder, x=mitoRatio, fill=seq_folder)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 194 rows containing non-finite values (stat_density).
#dev.off()
# Visualize the overall complexity of the gene expression by visualizing the genes detected per UMI
#pdf("tumor_complexity_genes_per_UMI.#pdf")
metadata %>%
ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
#dev.off()
#pdf("tumor_complexity_genes_per_UMI_celltype.#pdf")
metadata %>%
ggplot(aes(x=log10GenesPerUMI, color = seq_folder, fill=seq_folder)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
#dev.off()
use the merged and filtered object for all downstream analysis
filtered_seurat <- subset(x = merged_seurat,
subset= (nUMI >= 500) &
(nGene >= 250) &
(log10GenesPerUMI > 0.80) &
(mitoRatio < 0.20))
# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = filtered_seurat, slot = "counts")
# Output a logical vector for every gene on whether the more than zero counts per cell
nonzero <- counts > 0
# Sums all TRUE values and returns TRUE if more than 10 TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= 10
# Only keeping those genes expressed in more than 10 cells
filtered_counts <- counts[keep_genes, ]
# Reassign to filtered Seurat object
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = filtered_seurat@meta.data)
# Save filtered subset to new metadata
metadata_clean <- filtered_seurat@meta.data
### Plot after filtering
#pdf("tumor_complexity_genes_per_UMI_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(x=log10GenesPerUMI, color = sample, fill=sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
geom_vline(xintercept = 0.8)
#dev.off()
#pdf("tumor_correlation_MTgenes_vs_cell_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(color=sample, x=mitoRatio, fill=sample)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
theme_classic() +
geom_vline(xintercept = 0.2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 194 rows containing non-finite values (stat_density).
#dev.off()
#pdf("tumor_correlation_genes_vs_num_of_UMIs_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~sample)
## `geom_smooth()` using formula 'y ~ x'
#dev.off()
#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'
#dev.off()
#pdf("tumor_total_cell_numbers_genes_per_cell_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(color=sample, x=nGene, fill= sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = 300)
#dev.off()
#pdf("tumor_correlation_genes_vs_num_of_UMIs_celltype_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(x=nUMI, y=nGene, color=mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
facet_wrap(~seq_folder)
## `geom_smooth()` using formula 'y ~ x'
#dev.off()
#pdf("tumor_total_cell_numbers_genes_per_cell_type_FILTERED.#pdf")
metadata_clean %>%
ggplot(aes(x=seq_folder, y=log10(nGene), fill=seq_folder)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold")) +
ggtitle("NCells vs NGenes")
#dev.off()
# Create .RData object to load at any time
save(filtered_seurat, file="seurat_filtered_tumor.RData")
saveRDS(filtered_seurat, file="seurat_filtered_tumor.rds")
Session info
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] readr_1.4.0 ggplot2_3.3.3 stringr_1.4.0 hdf5r_1.3.3
## [5] cowplot_1.1.1 future.apply_1.7.0 future_1.21.0 magrittr_2.0.1
## [9] Matrix_1.3-2 dplyr_1.0.5 Seurat_3.2.3 knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_2.0-0 deldir_0.2-10
## [4] ellipsis_0.3.1 ggridges_0.5.3 rstudioapi_0.13
## [7] spatstat.data_2.0-0 farver_2.1.0 leiden_0.3.7
## [10] listenv_0.8.0 ggrepel_0.9.1 bit64_4.0.5
## [13] fansi_0.4.2 codetools_0.2-18 splines_3.6.3
## [16] polyclip_1.10-0 jsonlite_1.7.2 ica_1.0-2
## [19] cluster_2.1.1 png_0.1-7 uwot_0.1.10
## [22] shiny_1.6.0 sctransform_0.3.2 compiler_3.6.3
## [25] httr_1.4.2 assertthat_0.2.1 fastmap_1.1.0
## [28] lazyeval_0.2.2 cli_2.3.1 later_1.1.0.1
## [31] htmltools_0.5.1.1 tools_3.6.3 rsvd_1.0.3
## [34] igraph_1.2.6 gtable_0.3.0 glue_1.4.2
## [37] RANN_2.6.1 reshape2_1.4.4 Rcpp_1.0.6
## [40] spatstat_1.64-1 scattermore_0.7 jquerylib_0.1.3
## [43] vctrs_0.3.6 nlme_3.1-152 lmtest_0.9-38
## [46] xfun_0.21 globals_0.14.0 mime_0.10
## [49] miniUI_0.1.1.1 lifecycle_1.0.0 irlba_2.3.3
## [52] goftest_1.2-2 MASS_7.3-53.1 zoo_1.8-9
## [55] scales_1.1.1 hms_1.0.0 promises_1.2.0.1
## [58] spatstat.utils_2.0-0 parallel_3.6.3 RColorBrewer_1.1-2
## [61] yaml_2.2.1 reticulate_1.18 pbapply_1.4-3
## [64] gridExtra_2.3 sass_0.3.1 rpart_4.1-15
## [67] stringi_1.5.3 highr_0.8 rlang_0.4.10
## [70] pkgconfig_2.0.3 matrixStats_0.58.0 evaluate_0.14
## [73] lattice_0.20-41 ROCR_1.0-11 purrr_0.3.4
## [76] tensor_1.5 labeling_0.4.2 patchwork_1.1.1
## [79] htmlwidgets_1.5.3 bit_4.0.4 tidyselect_1.1.0
## [82] parallelly_1.23.0 RcppAnnoy_0.0.18 plyr_1.8.6
## [85] R6_2.5.0 generics_0.1.0 DBI_1.1.1
## [88] pillar_1.5.1 withr_2.4.1 mgcv_1.8-34
## [91] fitdistrplus_1.1-3 survival_3.2-7 abind_1.4-5
## [94] tibble_3.1.0 crayon_1.4.1 KernSmooth_2.23-18
## [97] utf8_1.1.4 plotly_4.9.3 rmarkdown_2.7
## [100] grid_3.6.3 data.table_1.14.0 digest_0.6.27
## [103] xtable_1.8-4 tidyr_1.1.3 httpuv_1.5.5
## [106] munsell_0.5.0 viridisLite_0.3.0 bslib_0.2.4